# ----------------------------------------------------------
# Estimating installation price as function of panels
# ----------------------------------------------------------
# Loading packages
library(pacman)
p_load(
  data.table, here, lubridate, tidyverse, hrbrthemes,
  tigris, sf, DescTools, fixest, janitor,
  priceR
)

# Some settings 
theme_set(theme_minimal())
options(tigris_use_cache = TRUE)

# Reading in the raw data
install_dt_raw = rbind(
  fread(here("Data/LBNL-pv-installations/TTS_LBNL_public_file_14-Dec-2021_p1.csv")),
  fread(here("Data/LBNL-pv-installations/TTS_LBNL_public_file_14-Dec-2021_p2.csv"))
)

# Getting state fips codes and census divisions 
fips_dt = states(cb = TRUE) |> data.table() %>% 
  .[,.(state_fips = STATEFP, state = STUSPS, name = NAME)]

region_dt = fread(here("Data/census-regions.csv")) |>
  setnames(
    old = c("State","State Code","Region","Division"), 
    new = c("name","state","region","division")
  )

# Replacing all -1 with NA
install_dt_raw[install_dt_raw == -1] = NA

# Filtering to residential and 2000-2020 
install_dt = 
  install_dt_raw[
    customerSegment == "RES" & year(dmy(PTODate_orProxy_)) %in% 2000:2020,
    .(
      install_date = dmy(PTODate_orProxy_),
      zipcode = hostCustomerZip__4_,
      city = hostCustomerCity,
      state,
      system_size_dc = systemSizeInDCSTC_KW_,
      total_cost = totalInstalledCost___,
      subsidy = Up_FrontCashIncentive___,
      cost_per_kw = totalInstalledCost___/systemSizeInDCSTC_KW_,
      total_panels = coalesce(moduleQty_1,0) + coalesce(moduleQty_2,0) + coalesce(moduleQty_3,0) 
    )
  ] |>
  merge(
    fips_dt, 
    by = "state",
    all.x = T
  ) |>
  merge(
    region_dt, 
    by = "state",
    all.x = T
  )

# Winsorizing because there are a few outliers  
win_pr = 0.01
pr = c(win_pr/2,1-win_pr/2)

install_dt[,':='(
  year = year(install_date) |> as.character(),
  total_cost_w = Winsorize(total_cost, na.rm = TRUE, probs = pr),
  total_panels_w = Winsorize(total_panels, na.rm = TRUE, probs = pr)
)]

# Converting to 2014 dollars 
us_inflation_dt = retrieve_inflation_data('US') |> data.table()

price_index_dt = 
  rbind( 
    # First pre-2014 years
    setorder(us_inflation_dt, -date)[
      date >= min(year(install_dt$install_date)) & date <=2014,
      .(year = date, 
        inflation = value, 
        multiplier = cumprod((1+data.table::shift(value, fill =0)/100))
    )],
    # Now after 2014
    setorder(us_inflation_dt, date)[date > 2014,
      .(year = date,
        inflation = value, 
        multiplier = 1/cumprod(1+value/100)
    )] 
  ) |> setorder(-year)
install_dt =
  merge(
    install_dt[,-c('inflation','multiplier')], 
    price_index_dt,
    by = 'year',
    all.x = TRUE
  )
# Converting to 2014 dollars
install_dt[,':='(
  total_cost_w_real = total_cost_w*multiplier,
  subsidy_real = subsidy*multiplier
)]

# Estimating a model for each state
price_mods_state = feols(
  data = install_dt[
    !is.na(total_cost_w_real) & 
    !is.na(total_panels_w) & 
    total_panels > 0 & 
    year %in% 2014:2018
  ],
  fml = total_cost_w_real ~ total_panels_w + sw0(I(as.numeric(year)-2017)),  #-1  + i(year, total_panels_w),
  fsplit = ~region,
  se= 'het'
)
# Printing the results 
etable(price_mods_state)
etable(
  price_mods_state[rhs = 'year'],
  tex = TRUE,
  label = "tab:price-reg",
  title = "Solar system installation prices",
  dict = c(
    total_cost_w_real ="Total Cost",
    total_panels_w = "Number of Panels",
    region = 'Census Region', 
    `I(as.numeric(year)-2017)` = 'Year (Relative to 2017)'
  ),
  digits = "r1",
  digits.stats = "r2",
  notes = "Sample limited to between 2014 and 2018, prices measured in 2014 dollars.",
  file = here('Draft/Figures/tracking-the-sun-price-reg-2014-2018.tex'),
  replace = TRUE
)

# Gathering the results into a table
region_results_dt = 
  map_dfr(
    unique(region_dt$region),
    \(reg){
      price_mods_state[sample = reg, rhs = 'year'][[1]] |> 
        broom::tidy() |>
        mutate(region = reg) |>
        clean_names() |>
        filter(term %in% c('(Intercept)', 'total_panels_w'))
    }
  ) |>
  mutate(name = ifelse(term == "(Intercept)","intercept","panels")) |>
  pivot_wider(
    id_cols = region,
    names_from = name, 
    values_from = c("estimate", "std_error")
  ) |>
  data.table() 

# Turning into state level: Excluding DC
state_results_dt = 
  merge(
    region_dt[state != "DC"],
    fips_dt[,.(state,state_fips)],
    by = "state"
  ) |>
  merge(
    region_results_dt,
    by = "region"
  ) |>
  merge( # Adding number of observations
    install_dt[
      !is.na(total_cost_w_real) & 
      !is.na(total_panels_w) & 
      total_panels > 0 & 
      year %in% 2014:2018,
      .(num_obs = .N), 
      by=state
    ],
    by = "state",
    all.x = TRUE
  )

# Exporting the results 
fwrite(
  state_results_dt[order(state_fips)], 
  file = here("Data/CleanedData/state-install-price.csv")
)

# Plotting these 
panels_price_p = 
  install_dt[
    !is.na(total_cost_w_real) 
    & !is.na(total_panels_w) 
    & total_panels > 0 & 
    year %in% 2017
  ][sample(.N,1e5)] |>
  ggplot(aes(x= total_panels_w, y = total_cost_w_real/1e3)) +
  #geom_point(color = 'gray', alpha = 0.01) +
  geom_smooth(method = "lm", color = 'black', linetype = 'dashed', se = FALSE) +
  geom_smooth(method = "gam") +
  labs(
    #title = "Relationship between panels and installation price",
    x = "Total Panels",
    y = "Total Cost ($1000)"
  ) +
  facet_wrap(~region)
ggsave(
  panels_price_p,
  filename = here("figures/panels-price-region.jpeg"),
  width = 7, height = 5
)
panels_price_state_p = 
  install_dt[!is.na(total_cost_w_real) & !is.na(total_panels_w) & total_panels > 0] |>
  ggplot(aes(x= total_panels_w, y = total_cost_w_real/1e3, color = state)) +
  geom_smooth(method = "gam") + 
  labs(
    title = "Relationship between panels and installation price",
    x = "Total Panels",
    y = "Total Cost ('000 USD)",
    caption = "Data filtered to include residential installations that have reported cost and panel counts between 2014 and 2018. Cost converted to 2014 USD. 
    Installations with zero reported panels are also removed. One percent of extreme values are winsorized."
  ) +
  scale_color_viridis_d(name = "State")
ggsave(
  panels_price_state_p,
  filename = here("figures/panels-price-state.jpeg"),
  width = 10, height = 7
)

# # What is average price? 
# install_dt[
#   !is.na(total_cost_w_real) & 
#   !is.na(total_panels_w) & 
#   #total_panels_w == total_panels & 
#   total_panels > 0 ,
#   .(
#     count = .N,
#     avg_cost = mean(total_cost_w_real), 
#     avg_panels = mean(total_panels_w)
#   ),
#   keyby = year
# ]

# # Distribution of panels 
# ggplot(
#   data = install_dt[
#   !is.na(total_cost_w_real) & 
#   !is.na(total_panels_w) & 
#   total_panels > 0
#   ], 
#   aes(x = total_panels_w)
# ) + 
# geom_histogram(bins = 30) + 
# geom_vline(xintercept = 15, linetype = 'dashed')